import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
# Load the Kirby21 multilevel volumetrics and keep the type-1 rows for subject 127.
kirby_url = "https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv"
df1 = pd.read_csv(kirby_url)
is_subject = (df1["type"] == 1) & (df1["id"] == 127)
subjectData = df1.loc[is_subject, ['roi', 'volume', 'type', 'level']]
subjectData.head()
| roi | volume | type | level | |
|---|---|---|---|---|
| 0 | Telencephalon_L | 531111 | 1 | 1 |
| 1 | Telencephalon_R | 543404 | 1 | 1 |
| 2 | Diencephalon_L | 9683 | 1 | 1 |
| 3 | Diencephalon_R | 9678 | 1 | 1 |
| 4 | Mesencephalon | 10268 | 1 | 1 |
# Load the MRIcloud ROI hierarchy lookup and standardize its column names.
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
# The raw file ships with placeholder headers ("modify", "modify.1", ...);
# map them onto the hierarchy levels they actually hold.
name_map = {
    "modify": "roi",
    "modify.1": "level4",
    "modify.2": "level3",
    "modify.3": "level2",
    "modify.4": "level1",
}
multilevel_lookup = (
    pd.read_csv(url, sep = "\t")
    .drop(columns=['Level5'])
    .rename(columns=name_map)
    [['roi', 'level4', 'level3', 'level2', 'level1']]
)
multilevel_lookup.head()
| roi | level4 | level3 | level2 | level1 | |
|---|---|---|---|---|---|
| 0 | SFG_L | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L |
| 1 | SFG_R | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R |
| 2 | SFG_PFC_L | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L |
| 3 | SFG_PFC_R | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R |
| 4 | SFG_pole_L | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L |
# Restrict to the finest (level 5) type-1 parcellation for this subject,
# attach the hierarchy labels, and compute each ROI's share of total volume.
level5 = subjectData.loc[(subjectData.type == 1) & (subjectData.level == 5),
                         ['roi', 'volume']]
subjectData = pd.merge(level5, multilevel_lookup, on = "roi")
subjectData = subjectData.assign(
    # Single root label for the Sankey diagram (intracranial volume).
    icv="ICV",
    # Compositional volume: fraction of the subject's total level-5 volume.
    comp=lambda d: d.volume / np.sum(d.volume),
)
subjectData.head()
| roi | volume | level4 | level3 | level2 | level1 | icv | comp | |
|---|---|---|---|---|---|---|---|---|
| 0 | SFG_L | 12926 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009350 |
| 1 | SFG_R | 10050 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.007270 |
| 2 | SFG_PFC_L | 12783 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009247 |
| 3 | SFG_PFC_R | 11507 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.008324 |
| 4 | SFG_pole_L | 3078 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.002227 |
# Build the Sankey node list and link endpoint indices for the hierarchy
# ICV -> level1 -> level2 -> level3.
# NOTE(review): level4 and roi columns exist in subjectData but this diagram
# stops at level3; the original also built (and never used) lists for them.
icv_lst = subjectData['icv'].values.tolist()
l1 = subjectData['level1'].values.tolist()
l2 = subjectData['level2'].values.tolist()
l3 = subjectData['level3'].values.tolist()
all_nodes = icv_lst + l1 + l2 + l3
# Map each label to its FIRST position in all_nodes. This replaces the
# original's repeated list.index() scans (O(n^2) over thousands of links)
# with O(1) lookups while producing byte-identical index lists.
first_index = {}
for i, node in enumerate(all_nodes):
    first_index.setdefault(node, i)
source_indices = [first_index[node] for node in icv_lst + l1 + l2]
target_indices = [first_index[node] for node in l1 + l2 + l3]
# Every link at every level carries the ROI's compositional volume.
values = subjectData['comp'].tolist() * 3
# Assign every node a random color from a qualitative palette; each link
# inherits the color of its source node.
colors = px.colors.qualitative.Prism
node_color_dict = {node: np.random.choice(colors) for node in all_nodes}
node_colors = [node_color_dict[node] for node in all_nodes]
# Link sources run ICV -> level1 -> level2, in the same order as source_indices.
link_colors = [node_color_dict[node]
               for level in ('icv', 'level1', 'level2')
               for node in subjectData[level]]
# Render the Sankey diagram of compositional brain volumes.
node_spec = dict(
    pad = 15,
    thickness = 10,
    label = all_nodes,
    color = node_colors,
)
link_spec = dict(
    source = source_indices,
    target = target_indices,
    value = values,
    color = link_colors,
)
fig = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])
fig.update_layout(height=1200, width=1200, font_size=15)
fig.show()
Link to Sankey Diagram: https://ychen095.github.io/ds4ph-sp22-hw4/hw4.html
import sqlite3 as sq3
# Open the opioid SQLite database and pull the county-level annual table.
# `con` intentionally stays open: later cells query more tables from it.
con = sq3.connect("opioid.db")
annual_sql = 'select * from annual'
county_annual = pd.read_sql_query(annual_sql, con)
county_annual.head()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | ||
|---|---|---|---|---|---|---|---|
| 0 | 1 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
| 1 | 2 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
| 2 | 3 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
| 3 | 4 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
| 4 | 5 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
# Montgomery County, AR was recorded with the wrong FIPS; patch it to 05097.
# Assigned as a string so the leading zero is preserved.
is_montgomery_ar = ((county_annual['BUYER_COUNTY'] == "MONTGOMERY")
                    & (county_annual["BUYER_STATE"] == "AR"))
county_annual.loc[is_montgomery_ar, 'countyfips'] = '05097'
county_annual[is_montgomery_ar]
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | ||
|---|---|---|---|---|---|---|---|
| 17429 | 17430 | MONTGOMERY | AR | 2006 | 469 | 175390 | 05097 |
| 17430 | 17431 | MONTGOMERY | AR | 2007 | 597 | 241270 | 05097 |
| 17431 | 17432 | MONTGOMERY | AR | 2008 | 561 | 251760 | 05097 |
| 17432 | 17433 | MONTGOMERY | AR | 2009 | 554 | 244160 | 05097 |
| 17433 | 17434 | MONTGOMERY | AR | 2010 | 449 | 247990 | 05097 |
| 17434 | 17435 | MONTGOMERY | AR | 2011 | 560 | 313800 | 05097 |
| 17435 | 17436 | MONTGOMERY | AR | 2012 | 696 | 339520 | 05097 |
| 17436 | 17437 | MONTGOMERY | AR | 2013 | 703 | 382300 | 05097 |
| 17437 | 17438 | MONTGOMERY | AR | 2014 | 491 | 396900 | 05097 |
# Drop rows whose county is recorded as the literal string "NA".
# NOTE(review): this assumes missing counties are stored as the string "NA",
# not as NaN — verify against the raw annual table.
county_annual = county_annual.loc[county_annual['BUYER_COUNTY'] != "NA"]
county_annual.head()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | ||
|---|---|---|---|---|---|---|---|
| 0 | 1 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 |
| 1 | 2 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 |
| 2 | 3 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 |
| 3 | 4 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 |
| 4 | 5 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 |
# Build land_area: place name, FIPS code, and the LND110210D land-area figure
# (presumably 2010 land area in square miles — TODO confirm units/code).
land = pd.read_sql_query('select * from land', con)
land_area = (
    land[['Areaname', 'STCOU', 'LND110210D']]
    # STCOU is the state+county FIPS; rename it to match county_annual's key.
    .rename(columns={"STCOU": "countyfips"})
)
land_area.head()
| Areaname | countyfips | LND110210D | |
|---|---|---|---|
| 0 | UNITED STATES | 00000 | 3531905.43 |
| 1 | ALABAMA | 01000 | 50645.33 |
| 2 | Autauga, AL | 01001 | 594.44 |
| 3 | Baldwin, AL | 01003 | 1589.78 |
| 4 | Barbour, AL | 01005 | 884.88 |
# Load the population table and attach land area by county FIPS.
# Left join keeps every population row even when no land-area match exists.
county_pop = pd.read_sql_query('select * from population', con)
county_info = pd.merge(county_pop, land_area, how='left', on='countyfips')
county_info.head()
| BUYER_COUNTY | BUYER_STATE | countyfips | STATE | COUNTY | county_name | NAME | variable | year | population | Areaname | LND110210D | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | AUTAUGA | AL | 01001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2006 | 51328 | Autauga, AL | 594.44 |
| 1 | 2 | BALDWIN | AL | 01003 | 1 | 3 | Baldwin | Baldwin County, Alabama | B01003_001 | 2006 | 168121 | Baldwin, AL | 1589.78 |
| 2 | 3 | BARBOUR | AL | 01005 | 1 | 5 | Barbour | Barbour County, Alabama | B01003_001 | 2006 | 27861 | Barbour, AL | 884.88 |
| 3 | 4 | BIBB | AL | 01007 | 1 | 7 | Bibb | Bibb County, Alabama | B01003_001 | 2006 | 22099 | Bibb, AL | 622.58 |
| 4 | 5 | BLOUNT | AL | 01009 | 1 | 9 | Blount | Blount County, Alabama | B01003_001 | 2006 | 55485 | Blount, AL | 644.78 |
# Sanity check: row count of the merged population/land-area table.
len(county_info)
28265
# Close the database connection now that all queries have run.
# BUG FIX: the original wrote `con.close` without parentheses, which merely
# evaluates to the bound method (see the `<function Connection.close>` output)
# and never closes the connection.
con.close()
<function Connection.close>
Link to scatter plot: https://ychen095.github.io/ds4ph-sp22-hw4/hw4.html
# Convert the DOSAGE_UNIT pill counts to numeric and rescale to millions.
# FIX: removed the original's preceding bare `pd.to_numeric(...)` statement,
# whose result was computed and immediately discarded.
county_annual = county_annual.assign(
    Pills_in_millions=pd.to_numeric(county_annual['DOSAGE_UNIT']) / 1000000
)
county_annual.head()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | Pills_in_millions | ||
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | ABBEVILLE | SC | 2006 | 877 | 363620 | 45001 | 0.36362 |
| 1 | 2 | ABBEVILLE | SC | 2007 | 908 | 402940 | 45001 | 0.40294 |
| 2 | 3 | ABBEVILLE | SC | 2008 | 871 | 424590 | 45001 | 0.42459 |
| 3 | 4 | ABBEVILLE | SC | 2009 | 930 | 467230 | 45001 | 0.46723 |
| 4 | 5 | ABBEVILLE | SC | 2010 | 1197 | 539280 | 45001 | 0.53928 |
# Average pills (in millions) per county, grouped by state and year.
# FIX: the original passed 'Pills_in_millions' positionally to .mean(), where
# it lands in the `numeric_only` parameter slot — it does NOT select a column.
# Select the column explicitly so the result has exactly the three columns
# (BUYER_STATE, year, Pills_in_millions) the analysis expects.
annual_grouped = (
    county_annual
    .groupby(['BUYER_STATE', 'year'], as_index=False)['Pills_in_millions']
    .mean()
)
annual_grouped
| BUYER_STATE | year | Pills_in_millions | |
|---|---|---|---|
| 0 | AK | 2006 | 0.783350 |
| 1 | AK | 2007 | 0.909075 |
| 2 | AK | 2008 | 1.100459 |
| 3 | AK | 2009 | 1.120053 |
| 4 | AK | 2010 | 1.167372 |
| ... | ... | ... | ... |
| 495 | WY | 2010 | 0.819123 |
| 496 | WY | 2011 | 0.866664 |
| 497 | WY | 2012 | 0.928649 |
| 498 | WY | 2013 | 0.930924 |
| 499 | WY | 2014 | 0.940087 |
500 rows × 3 columns
# Scatter of state-level average pills per county over time. Left as a bare
# expression so the notebook renders the resulting figure.
px.scatter(
    annual_grouped,
    x='year',
    y='Pills_in_millions',
    labels={'Pills_in_millions': 'Number of pills in million'},
    title='Average number of opioid pills by state',
    height=600,
)